library(ggplot2)
library(maps)
library(ggmap)
# Load the two course data sets.
# The data folder is named once and joined onto each file name with
# file.path(), so the session's working directory is never changed by this
# script. Edit data_dir if the CSV files live somewhere else.
data_dir <- "D:/Google Drive/College/4-The Analytics Edge/data7"
murders <- read.csv(file.path(data_dir, "murders.csv"))
mvt <- read.csv(file.path(data_dir, "mvt.csv"))
# First look at the motor vehicle theft data (printed output kept below):
summary(mvt)
## Date Latitude Longitude
## 5/16/08 0:00 : 11 Min. :41.64 Min. :-87.93
## 10/17/01 22:00: 10 1st Qu.:41.77 1st Qu.:-87.72
## 4/13/04 21:00 : 10 Median :41.85 Median :-87.68
## 9/17/05 22:00 : 10 Mean :41.84 Mean :-87.68
## 10/12/01 22:00: 9 3rd Qu.:41.92 3rd Qu.:-87.64
## 10/13/01 22:00: 9 Max. :42.02 Max. :-87.52
## (Other) :191582 NA's :2276 NA's :2276
str(mvt)
## 'data.frame': 191641 obs. of 3 variables:
## $ Date : Factor w/ 131680 levels "1/1/01 0:01",..: 42824 42823 42823 42823 42822 42821 42820 42819 42817 42816 ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ...
# Parse the Date text (month/day/two-digit year, then hour:minute) into a
# date-time value. strptime() returns a POSIXlt object, whose individual
# fields (wday, hour) are read directly below.
mvt$Date <- strptime(mvt$Date, format = "%m/%d/%y %H:%M")
# Day of the week. weekdays() would return names in the session's locale, but
# the rest of this script matches on the English names ("Sunday", "Saturday",
# ...), so the name is looked up from the locale-independent wday field
# (0 = Sunday ... 6 = Saturday). Rows whose date failed to parse stay NA.
mvt$Weekday <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)[mvt$Date$wday + 1L]
# Hour of the day (0-23), taken from the parsed date-time:
mvt$Hour <- mvt$Date$hour
# Check the structure again now that the two new columns exist:
str(mvt)
## 'data.frame': 191641 obs. of 5 variables:
## $ Date : POSIXlt, format: "2012-12-31 23:15:00" "2012-12-31 22:00:00" ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude: num -87.6 -87.7 -87.8 -87.7 -87.6 ...
## $ Weekday : chr "Monday" "Monday" "Monday" "Monday" ...
## $ Hour : int 23 22 22 22 21 20 20 20 19 18 ...
# A line plot of thefts by day needs the number of thefts on each weekday.
# Tabulating the Weekday column gives exactly that:
table(mvt[["Weekday"]])
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 29284 27397 27118 26316 27319 26791 27416
# Keep the tabulation as a data frame (Var1 = weekday, Freq = count):
WeekdayCounts <- as.data.frame(table(mvt[["Weekday"]]))
str(WeekdayCounts)
## 'data.frame': 7 obs. of 2 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7
## $ Freq: int 29284 27397 27118 26316 27319 26791 27416
# Thefts per weekday as a line plot; group = 1 joins all points into one line.
ggplot(data = WeekdayCounts, mapping = aes(x = Var1, y = Freq)) +
  geom_line(mapping = aes(group = 1))
# The x axis follows the factor's level order, which is alphabetical so far.
# Re-level Var1 as an ordered factor running Sunday through Saturday.
WeekdayCounts$Var1 <- ordered(
  WeekdayCounts$Var1,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)
# Same plot again, now in chronological order:
ggplot(data = WeekdayCounts, mapping = aes(x = Var1, y = Freq)) +
  geom_line(mapping = aes(group = 1))
# Dashed, semi-transparent line with descriptive axis titles:
ggplot(data = WeekdayCounts, mapping = aes(x = Var1, y = Freq)) +
  geom_line(mapping = aes(group = 1), linetype = 2, alpha = 0.3) +
  xlab("Day of the Week") +
  ylab("Total Motor Vehicle Thefts")
# Cross-tabulate thefts by weekday (rows) and hour of the day (columns):
table(mvt[["Weekday"]], mvt[["Hour"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## Friday 1873 932 743 560 473 602 839 1203 1268 1286 938 822
## Monday 1900 825 712 527 415 542 772 1123 1323 1235 971 737
## Saturday 2050 1267 985 836 652 508 541 650 858 1039 946 789
## Sunday 2028 1236 1019 838 607 461 478 483 615 864 884 787
## Thursday 1856 816 696 508 400 534 799 1135 1298 1301 932 731
## Tuesday 1691 777 603 464 414 520 845 1118 1175 1174 948 786
## Wednesday 1814 790 619 469 396 561 862 1140 1329 1237 947 763
##
## 12 13 14 15 16 17 18 19 20 21 22 23
## Friday 1207 857 937 1140 1165 1318 1623 1652 1736 1881 2308 1921
## Monday 1129 824 958 1059 1136 1252 1518 1503 1622 1815 2009 1490
## Saturday 1204 767 963 1086 1055 1084 1348 1390 1570 1702 2078 1750
## Sunday 1192 789 959 1037 1083 1160 1389 1342 1706 1696 2079 1584
## Thursday 1093 752 831 1044 1131 1258 1510 1537 1668 1776 2134 1579
## Tuesday 1108 762 908 1071 1090 1274 1553 1496 1696 1816 2044 1458
## Wednesday 1225 804 863 1075 1076 1289 1580 1507 1718 1748 2093 1511
# Store the weekday-by-hour tabulation as a data frame with one row per
# (weekday, hour) pair: Var1 = weekday, Var2 = hour, Freq = count.
DayHourCounts <- as.data.frame(table(mvt[["Weekday"]], mvt[["Hour"]]))
str(DayHourCounts)
## 'data.frame': 168 obs. of 3 variables:
## $ Var1: Factor w/ 7 levels "Friday","Monday",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ Var2: Factor w/ 24 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ Freq: int 1873 1900 2050 2028 1856 1691 1814 932 825 1267 ...
# Var2 holds the hour as a factor. Recover the numeric hour by converting the
# level labels to numbers and indexing them with the factor itself.
DayHourCounts$Hour <- as.numeric(levels(DayHourCounts$Var2))[DayHourCounts$Var2]
# One line per weekday, thefts against hour of the day:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1))
# Same lines, thicker and coloured by weekday:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1, color = Var1), size = 2)
# Label each row as falling on a weekend or a weekday:
DayHourCounts$Type <- ifelse(
  (DayHourCounts$Var1 == "Saturday") | (DayHourCounts$Var1 == "Sunday"),
  "Weekend",
  "Weekday"
)
# Still one line per day, but coloured by weekend/weekday:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1, color = Type), size = 2)
# Add transparency so overlapping lines remain visible:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1, color = Type), size = 2, alpha = 0.5)
# Put the weekdays in calendar order (Monday through Sunday) so the heatmap
# rows are not sorted alphabetically.
DayHourCounts$Var1 <- ordered(
  DayHourCounts$Var1,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
# Heatmap: hour on x, weekday on y, tile fill = number of thefts.
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq))
# Give the legend a readable title and drop the y-axis title:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq)) +
  theme(axis.title.y = element_blank()) +
  scale_fill_gradient(name = "Total MV Thefts")
# Same heatmap with a white-to-red colour scale:
ggplot(data = DayHourCounts, mapping = aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq)) +
  theme(axis.title.y = element_blank()) +
  scale_fill_gradient(name = "Total MV Thefts", low = "white", high = "red")
# Fetch a map of Chicago at zoom level 11 through ggmap:
chicago <- get_map(location = "chicago", zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=chicago&sensor=false
# Draw the map on its own:
ggmap(chicago)
# Overlay the first 100 motor vehicle thefts as points:
ggmap(chicago) +
  geom_point(
    data = mvt[seq_len(100), ],
    mapping = aes(x = Longitude, y = Latitude)
  )
## Warning: Removed 7 rows containing missing values (geom_point).
# Bin the thefts into small areas by rounding longitude and latitude to two
# decimal places, then count the thefts in every (longitude, latitude) cell.
LatLonCounts <- as.data.frame(
  table(
    round(mvt[["Longitude"]], digits = 2),
    round(mvt[["Latitude"]], digits = 2)
  )
)
str(LatLonCounts)
## 'data.frame': 1638 obs. of 3 variables:
## $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq: int 0 0 0 0 0 0 0 0 0 0 ...
# Var1 (longitude) and Var2 (latitude) are factors after tabulation. Turn each
# back into numbers by converting its level labels and indexing by the factor.
LatLonCounts$Long <- as.numeric(levels(LatLonCounts$Var1))[LatLonCounts$Var1]
LatLonCounts$Lat <- as.numeric(levels(LatLonCounts$Var2))[LatLonCounts$Var2]
# One point per area on the Chicago map; colour and size both show the count.
ggmap(chicago) +
  geom_point(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, color = Freq, size = Freq)
  )
## Warning: Removed 615 rows containing missing values (geom_point).
# Same point map with a yellow-to-red colour scale:
ggmap(chicago) +
  geom_point(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, color = Freq, size = Freq)
  ) +
  scale_color_gradient(low = "yellow", high = "red")
## Warning: Removed 615 rows containing missing values (geom_point).
# Alternative view: red tiles whose transparency reflects the theft count.
ggmap(chicago) +
  geom_tile(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, alpha = Freq),
    fill = "red"
  )
# Keep only the areas with at least one theft, which removes the red squares
# drawn over the water, then redraw the tiles.
LatLonCounts2 <- LatLonCounts[which(LatLonCounts$Freq > 0), ]
ggmap(chicago) +
  geom_tile(
    data = LatLonCounts2,
    mapping = aes(x = Long, y = Lat, alpha = Freq),
    fill = "red"
  )
# Inspect the state-level murders data (printed output kept below):
str(murders)
## 'data.frame': 51 obs. of 6 variables:
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Population : int 4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
## $ PopulationDensity: num 94.65 1.26 57.05 56.43 244.2 ...
## $ Murders : int 199 31 352 130 1811 117 131 48 131 987 ...
## $ GunMurders : int 135 19 232 93 1257 65 97 38 99 669 ...
## $ GunOwnership : num 0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...
# Get the US state outlines as a data frame of polygon vertices:
statesMap <- map_data(map = "state")
str(statesMap)
## 'data.frame': 15537 obs. of 6 variables:
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ subregion: chr NA NA NA NA ...
# Draw the state outlines: white fill, black borders, one polygon per group.
ggplot(data = statesMap, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "white")
# Prepare the two data frames for merging.
# statesMap identifies states by lowercase name in its "region" column, so add
# a matching lowercase "region" column to the murders data.
murders$region <- tolower(murders$State)
# Join the map vertices with the per-state statistics; by = "region" names the
# column used to match rows between the two data frames.
murderMap <- merge(statesMap, murders, by = "region")
# merge() makes no promise about the row order within each state, and
# geom_polygon() connects vertices in row order. Restore the drawing sequence
# recorded in the map data's "order" column so outlines are never scrambled.
murderMap <- murderMap[order(murderMap$order), ]
str(murderMap)
## 'data.frame': 15537 obs. of 12 variables:
## $ region : chr "alabama" "alabama" "alabama" "alabama" ...
## $ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ lat : num 30.4 30.4 30.4 30.3 30.3 ...
## $ group : num 1 1 1 1 1 1 1 1 1 1 ...
## $ order : int 1 2 3 4 5 6 7 8 9 10 ...
## $ subregion : chr NA NA NA NA ...
## $ State : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Population : int 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
## $ PopulationDensity: num 94.7 94.7 94.7 94.7 94.7 ...
## $ Murders : int 199 199 199 199 199 199 199 199 199 199 ...
## $ GunMurders : int 135 135 135 135 135 135 135 135 135 135 ...
## $ GunOwnership : num 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
# Shade each state by its number of murders (black = low, red = high):
ggplot(
  data = murderMap,
  mapping = aes(x = long, y = lat, group = group, fill = Murders)
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(guide = "legend", low = "black", high = "red")
# For comparison, shade each state by its population:
ggplot(
  data = murderMap,
  mapping = aes(x = long, y = lat, group = group, fill = Population)
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(guide = "legend", low = "black", high = "red")
# The population map looks almost exactly the same as the murders map. We
# therefore need to plot the murder rate instead of the number of murders, to
# make sure we are not just plotting a population map.
# Murder rate: murders per 100,000 residents.
murderMap$MurderRate <- with(murderMap, Murders / Population * 100000)
# Shade each state by murder rate:
ggplot(
  data = murderMap,
  mapping = aes(x = long, y = lat, group = group, fill = MurderRate)
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(guide = "legend", low = "black", high = "red")
# Same map with the fill scale limited to 0-10, so states with a murder rate
# above 10 fall outside the scale:
ggplot(
  data = murderMap,
  mapping = aes(x = long, y = lat, group = group, fill = MurderRate)
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    guide = "legend", low = "black", high = "red", limits = c(0, 10)
  )
# Gun owners per 100,000 population.
# GunOwnership is listed as 0.517, 0.578, 0.311, ... in the str(murders)
# output, i.e. it already looks like a share of the population rather than a
# head count (NOTE(review): confirm against the data description). A share is
# scaled to "per 100,000" by multiplying; dividing it by Population, as was
# done before, produced a value that mostly reflected 1 / Population.
murderMap$GunOwnerRate <- murderMap$GunOwnership * 100000
# Shade each state by gun ownership rate:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = GunOwnerRate)) +
  geom_polygon(color = "black") +
  scale_fill_gradient(low = "black", high = "red", guide = "legend")
# A heatmap can visualize data that would be too big for a table.
# Plotting data on a map is much more effective than a table for location-based data.